library(dplyr)
library(data.table)
library(forcats)
library(glmnet)
library(pROC)
library(knitr)
library(ggplot2)
library(plotly)Project Part 3
Group 9
Loading libraries
Loading dataset
dataset <- fread("SMARTc.csv", sep = ";")Re-encode the categorical variables
dataset <- mutate(dataset,
EVENT = factor(EVENT),
EVENT = fct_recode(EVENT, "no" = "0", "yes" = "1"),
SEX = factor(SEX),
SEX = fct_recode(SEX, "male" = "1", "female" = "2"),
DIABETES = factor(DIABETES),
DIABETES = fct_recode(DIABETES, "no" = "0", "yes" = "1"),
SMOKING = factor(SMOKING),
SMOKING = fct_recode(SMOKING, "never" = "1", "former" = "2", "current" = "3"),
alcohol = factor(alcohol),
alcohol = fct_recode(alcohol, "never" = "1", "former" = "2", "current" = "3"),
CEREBRAL = factor(CEREBRAL),
CEREBRAL = fct_recode(CEREBRAL, "no" = "0", "yes" = "1"),
CARDIAC = factor(CARDIAC),
CARDIAC = fct_recode(CARDIAC, "no" = "0", "yes" = "1"),
AAA = factor(AAA),
AAA = fct_recode(AAA, "no" = "0", "yes" = "1"),
PERIPH = factor(PERIPH),
PERIPH = fct_recode(PERIPH, "no" = "0", "yes" = "1"),
albumin = factor(albumin),
albumin = fct_recode(albumin, "no" = "1", "micro" = "2", "macro" = "3"),
STENOSIS = factor(STENOSIS),
STENOSIS = fct_recode(STENOSIS, "no" = "0", "yes" = "1"),
)Lasso regression model
set.seed(123)
# p.278
formula <- EVENT ~ AGE + SEX + BMI + SYSTH + HDL + DIABETES +
+HOMOC + log(CREAT) + STENOSIS + IMT + SMOKING +
alcohol + albumin
# Splitting the data
train_sample <- sample(1:nrow(dataset), nrow(dataset) * 0.7)
train_data <- dataset[train_sample, ]
test_data <- dataset[-train_sample, ]
# with grid search for finetuning
grid <- 10 ^ seq(10, -2, length = 100)
lasso.mod <- cv.glmnet(
x = model.matrix(formula, data = train_data)[, -1],
y = as.numeric(train_data$EVENT) - 1,
alpha = 1,
family = "binomial",
nfolds = 10,
lambda = grid
)
cv.out <- cv.glmnet(
x = model.matrix(formula, data = train_data)[, -1],
y = as.numeric(train_data$EVENT) - 1,
alpha = 1,
family = "binomial",
nfolds = 10,
lambda = grid
)
best_lambda <- cv.out$lambda.min
lasso.pred <- predict(lasso.mod,
s = best_lambda,
newx = model.matrix(formula, data = test_data)[, -1], type = "response"
)
# Mean squared error
mean((lasso.pred - (as.numeric(test_data$EVENT) - 1))^2)[1] 0.1023319
plot(cv.out)Performance of the model
roc_lasso <- roc(test_data$EVENT, lasso.pred)
auc_lasso <- auc(roc_lasso)display code to plot the ROC curve
roc_data <- data.frame(
Fold = paste("Fold", 1),
Sensitivity = roc_lasso$sensitivities,
Specificity = 1 - roc_lasso$specificities
)
roc_plot <- ggplot(roc_data, aes(x = Specificity, y = Sensitivity)) +
geom_line(linewidth = 0.8, color = "#3459E6") +
geom_segment(x = 0, xend = 1, y = 0, yend = 1, linetype = "dashed", color = "gray") +
theme_minimal(base_size = 14) +
labs(
title = "ROC Curve for Lasso Model",
x = "1 - Specificity",
y = "Sensitivity"
) +
theme(
plot.title = element_text(hjust = 0.5, size = 15, face = "bold"),
legend.position = "none"
) +
coord_equal()
ggplotly(roc_plot)print(auc_lasso)Area under the curve: 0.7096
We found an AUC of around 0.71 for the Lasso model. This means that the model is able to distinguish between patients with and without an event 71% of the time.
lasso_coefficients <- coef(cv.out, s = "lambda.min")
lasso_coefficients_df <- as.data.frame(as.matrix(lasso_coefficients))
lasso_coefficients_df$variable <- rownames(lasso_coefficients_df)
colnames(lasso_coefficients_df) <- c("coefficient", "variable")
# Filter out variables with non-zero coefficients
important_variables <- subset(lasso_coefficients_df, coefficient != 0)
kable(important_variables)| coefficient | variable | |
|---|---|---|
| (Intercept) | -5.8768329 | (Intercept) |
| AGE | 0.0270126 | AGE |
| SYSTH | 0.0013264 | SYSTH |
| HDL | -0.6811032 | HDL |
| DIABETESyes | 0.0983183 | DIABETESyes |
| HOMOC | 0.0405386 | HOMOC |
| log(CREAT) | 0.3505297 | log(CREAT) |
| STENOSISyes | 0.4143148 | STENOSISyes |
| IMT | 0.4337308 | IMT |
| SMOKINGformer | 0.1096286 | SMOKINGformer |
| SMOKINGcurrent | -0.1489430 | SMOKINGcurrent |
| alcoholcurrent | -0.0481888 | alcoholcurrent |
| albuminmacro | 0.5830501 | albuminmacro |
We see that some variables have positive coefficiens, while others have negative coefficients. This means that the model considers some variables as positively correlated with the event (albuminmacro, IMT, STENOSISyes, log(CREAT), DIABETESyes, SMOKINGformer), while others are negatively correlated (HDL, SMOKINGcurrent, alcoholcurrent). Also, some variables like AGE or HOMOC have smaller coefficients, but this can be due to the unit of the variable (e.g. AGE is in years, HOMOC is in µmol/L).
Stepwise backward variable selection
stepwise_backward <- step(
glm(
formula,
data = train_data,
family = "binomial"
),
direction = "backward"
)
stepwise_backward_pred <- predict(
stepwise_backward,
newdata = test_data,
type = "response"
)